# Data Preparation Steps ----
# Read the Amtrak ridership data; keep character columns as plain strings
# (FALSE spelled out: T/F are reassignable and unsafe in scripts)
rider <- read.csv("AmtrakBig_CA_Question-3.csv", stringsAsFactors = FALSE)
glimpse(rider)
## Observations: 159
## Variables: 4
## $ Month <chr> "Jan-05", "Feb-05", "Mar-05", "Apr-05", "May-05", "Jun…
## $ Ridership <int> 1709, 1621, 1973, 1812, 1975, 1862, 1940, 2013, 1596, …
## $ t <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,…
## $ Season <chr> "Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug"…
# Build a Date column from the "Jan-05" style Month labels, fixing the day
# at the 1st of each month.
# NOTE(review): strptime() returns POSIXlt; that works for format() below,
# but consider as.Date() if this column is kept in the data frame long-term.
rider$Date <- strptime(paste("01", rider$Month), "%d %b-%y")
rider$Year <- format(rider$Date, "%Y")
# Cross-tabulate Year x Season (month) to confirm there are no gaps
table(rider$Year, rider$Season)
##
## Apr Aug Dec Feb Jan Jul Jun Mar May Nov Oct Sep
## 2005 1 1 1 1 1 1 1 1 1 1 1 1
## 2006 1 1 1 1 1 1 1 1 1 1 1 1
## 2007 1 1 1 1 1 1 1 1 1 1 1 1
## 2008 1 1 1 1 1 1 1 1 1 1 1 1
## 2009 1 1 1 1 1 1 1 1 1 1 1 1
## 2010 1 1 1 1 1 1 1 1 1 1 1 1
## 2011 1 1 1 1 1 1 1 1 1 1 1 1
## 2012 1 1 1 1 1 1 1 1 1 1 1 1
## 2013 1 1 1 1 1 1 1 1 1 1 1 1
## 2014 1 1 1 1 1 1 1 1 1 1 1 1
## 2015 1 1 1 1 1 1 1 1 1 1 1 1
## 2016 1 1 1 1 1 1 1 1 1 1 1 1
## 2017 1 1 1 1 1 1 1 1 1 1 1 1
## 2018 0 0 0 1 1 0 0 1 0 0 0 0
# There is no missing gap in the dataset
# Monthly time series starting January 2005 (frequency 12)
rider_ts <- ts(rider$Ridership, frequency = 12, start = c(2005, 1))
is.ts(rider_ts)
## [1] TRUE
rider_ts
## Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec
## 2005 1709 1621 1973 1812 1975 1862 1940 2013 1596 1725 1676 1814
## 2006 1615 1557 1891 1956 1885 1623 1903 1997 1704 1810 1862 1875
## 2007 1705 1619 1837 1957 1917 1882 1933 1996 1673 1753 1720 1734
## 2008 1563 1574 1903 1834 1831 1776 1868 1907 1686 1779 1776 1783
## 2009 1548 1497 1798 1733 1772 1761 1792 1875 1571 1647 1673 1657
## 2010 1382 1361 1559 1608 1697 1693 1836 1943 1551 1687 1576 1700
## 2011 1397 1372 1708 1655 1763 1776 1934 2008 1616 1774 1732 1797
## 2012 1570 1413 1755 1825 1843 1826 1968 1922 1670 1791 1817 1847
## 2013 1599 1549 1832 1840 1846 1865 1966 1949 1607 1804 1850 1836
## 2014 1542 1617 1920 1971 1992 2010 2054 2097 1824 1977 1981 2000
## 2015 1683 1663 2008 2024 2047 2073 2127 2203 1708 1951 1974 1985
## 2016 1760 1771 2020 2048 2069 1994 2075 2027 1734 1917 1858 1996
## 2017 1778 1749 2066 2099 2105 2130 2223 2174 1931 2121 2076 2141
## 2018 1832 1838 2132
# acf(rider_ts) ; pacf(rider_ts)
#ggplot(NULL, aes(y = rider_ts, x = seq_along(rider_ts))) + geom_line()+geom_point()
# Split TS data for Train and Test (year 2016, 2017 & 2018 Jan-Mar)
# Hold out the last 27 observations (Jan 2016 - Mar 2018) as the test set;
# a single named constant keeps the two subset() calls consistent.
test_len <- 27
train <- subset(rider_ts, end = length(rider_ts) - test_len)
test <- subset(rider_ts, start = length(rider_ts) - test_len + 1)
# Full series: trend/seasonality view and month-of-year index
autoplot(rider_ts)
cycle(rider_ts)

## Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec
## 2005 1 2 3 4 5 6 7 8 9 10 11 12
## 2006 1 2 3 4 5 6 7 8 9 10 11 12
## 2007 1 2 3 4 5 6 7 8 9 10 11 12
## 2008 1 2 3 4 5 6 7 8 9 10 11 12
## 2009 1 2 3 4 5 6 7 8 9 10 11 12
## 2010 1 2 3 4 5 6 7 8 9 10 11 12
## 2011 1 2 3 4 5 6 7 8 9 10 11 12
## 2012 1 2 3 4 5 6 7 8 9 10 11 12
## 2013 1 2 3 4 5 6 7 8 9 10 11 12
## 2014 1 2 3 4 5 6 7 8 9 10 11 12
## 2015 1 2 3 4 5 6 7 8 9 10 11 12
## 2016 1 2 3 4 5 6 7 8 9 10 11 12
## 2017 1 2 3 4 5 6 7 8 9 10 11 12
## 2018 1 2 3
# Training series: trend/seasonality view and month-of-year index
autoplot(train)
cycle(train)

## Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec
## 2005 1 2 3 4 5 6 7 8 9 10 11 12
## 2006 1 2 3 4 5 6 7 8 9 10 11 12
## 2007 1 2 3 4 5 6 7 8 9 10 11 12
## 2008 1 2 3 4 5 6 7 8 9 10 11 12
## 2009 1 2 3 4 5 6 7 8 9 10 11 12
## 2010 1 2 3 4 5 6 7 8 9 10 11 12
## 2011 1 2 3 4 5 6 7 8 9 10 11 12
## 2012 1 2 3 4 5 6 7 8 9 10 11 12
## 2013 1 2 3 4 5 6 7 8 9 10 11 12
## 2014 1 2 3 4 5 6 7 8 9 10 11 12
## 2015 1 2 3 4 5 6 7 8 9 10 11 12
# Test series: trend/seasonality view and month-of-year index
autoplot(test)
cycle(test)

## Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec
## 2016 1 2 3 4 5 6 7 8 9 10 11 12
## 2017 1 2 3 4 5 6 7 8 9 10 11 12
## 2018 1 2 3
# Methods Used ----
# Lag plots
gglagplot(rider_ts)

# There is evidence of seasonality at lag 12.
# Try seasonal differencing at lag 12
# Visualise the various components of TS
decompose(rider_ts) %>% plot()

# or plot(stl(rider_ts, s.window="periodic"))
# There is a downward trend from year 2006 to 2010, which later turned positive
# Try differencing by the order of d=1. Also try D=1 for seasonal effect
# Try AR(1)
# Try MA(1)
# Train dataset
# Making Trend Stationary with Differencing
train %>%
  diff() %>%
  ggtsdisplay(smooth = TRUE)

train %>%
  diff() %>%
  diff(lag = 12) %>% # additional seasonal differencing at lag 12
  ggtsdisplay(smooth = TRUE)

# ACF improved with additional seasonal differencing at lag 12.
# Have d=1 and D=1 for ARIMA
# Keep the series after trend and seasonality are removed by differencing
train010_010 <- train %>% diff() %>% diff(lag = 12)
plot(train010_010)

# Augmented Dickey-Fuller Test
# Prior to Differencing
adfTest(train)
##
## Title:
## Augmented Dickey-Fuller Test
##
## Test Results:
## PARAMETER:
## Lag Order: 1
## STATISTIC:
## Dickey-Fuller: -0.1768
## P VALUE:
## 0.5575
##
## Description:
## Tue Sep 3 00:22:05 2019 by user:
# After Differencing
# ADF on the doubly-differenced series; a small p-value supports stationarity
adfTest(train010_010)
## Warning in adfTest(train010_010): p-value smaller than printed p-value
##
## Title:
## Augmented Dickey-Fuller Test
##
## Test Results:
## PARAMETER:
## Lag Order: 1
## STATISTIC:
## Dickey-Fuller: -10.4791
## P VALUE:
## 0.01
##
## Description:
## Tue Sep 3 00:22:05 2019 by user:
# p-value < 0.05,
# so we reject the NULL hypothesis that the series is non-stationary
# AR: Using arima p=1 (non-seasonal)
model_100 <- Arima(train010_010, order = c(1, 0, 0))
checkresiduals(model_100)

##
## Ljung-Box test
##
## data: Residuals from ARIMA(1,0,0) with non-zero mean
## Q* = 37.99, df = 22, p-value = 0.01837
##
## Model df: 2. Total lags used: 24
tsdisplay(residuals(model_100)) # showing the PACF

# The PACF spike at lag 1 is reduced, but the spike at lag 12 still exists.
# ggfortify::ggtsdiag(model_100)
# Ljung-Box Statistic still showing the residuals p-value <0.05,
# rejecting the NULL Hypothesis,
# implying residual data are not independently distributed; they exhibit serial correlation.
# Note: ggfortify causes conflict with autoplot()
# Next do MA: Using arima p=1, q=1 (non-seasonal)
model_101 <- Arima(train010_010, order = c(1, 0, 1))
checkresiduals(model_101)

##
## Ljung-Box test
##
## data: Residuals from ARIMA(1,0,1) with non-zero mean
## Q* = 28.259, df = 21, p-value = 0.1329
##
## Model df: 3. Total lags used: 24
tsdisplay(residuals(model_101)) # showing the PACF

# The ACF reduced. PACF also reduced, but the spike at lag 12 still exists.
# Try adding a seasonal component at lag 12 (D=1)
# ggfortify::ggtsdiag(model_101)
# Ljung-Box Statistic now showed the residuals p-value > 0.05,
# do not reject the NULL Hypothesis,
# implying residual data are independently distributed.
# Note: ggfortify causes conflict with autoplot()
# Add Seasonal Components: Using arima Q=1
model_101_001 <- Arima(train010_010, order = c(1, 0, 1), seasonal = c(0, 0, 1))
checkresiduals(model_101_001)

##
## Ljung-Box test
##
## data: Residuals from ARIMA(1,0,1)(0,0,1)[12] with non-zero mean
## Q* = 11.44, df = 20, p-value = 0.934
##
## Model df: 4. Total lags used: 24
tsdisplay(residuals(model_101_001)) # showing the PACF

# The spike at lag 12 dropped below the threshold.
# Add Seasonal Components: Using arima Q=2
model_101_002 <- Arima(train010_010, order = c(1, 0, 1), seasonal = c(0, 0, 2))
checkresiduals(model_101_002)

##
## Ljung-Box test
##
## data: Residuals from ARIMA(1,0,1)(0,0,2)[12] with non-zero mean
## Q* = 10.414, df = 19, p-value = 0.942
##
## Model df: 5. Total lags used: 24
tsdisplay(residuals(model_101_002)) # showing the PACF

# There are not many changes to the ACF and PACF; try P=1 instead.
# Add Seasonal Components: Using arima P=1, Q=1
model_101_101 <- Arima(train010_010, order = c(1, 0, 1), seasonal = c(1, 0, 1))
checkresiduals(model_101_101)

##
## Ljung-Box test
##
## data: Residuals from ARIMA(1,0,1)(1,0,1)[12] with non-zero mean
## Q* = 10.447, df = 19, p-value = 0.941
##
## Model df: 5. Total lags used: 24
tsdisplay(residuals(model_101_101)) # showing the PACF

# There are not many changes to the ACF and PACF.
# Therefore, the ARIMA parameters for the non-stationary train set will be
# arima (p=1, d=1, q=1) * (P=0, D=1, Q=1)
model_111_011 <- Arima(train, order = c(1, 1, 1), seasonal = c(0, 1, 1))
checkresiduals(model_111_011)

##
## Ljung-Box test
##
## data: Residuals from ARIMA(1,1,1)(0,1,1)[12]
## Q* = 12.319, df = 21, p-value = 0.9306
##
## Model df: 3. Total lags used: 24
tsdisplay(residuals(model_111_011)) # showing the PACF

# Coefficient estimates and training accuracy for ARIMA(1,1,1)(0,1,1)[12]
summary(model_111_011)
## Series: train
## ARIMA(1,1,1)(0,1,1)[12]
##
## Coefficients:
## ar1 ma1 sma1
## 0.2904 -0.7377 -0.6973
## s.e. 0.1623 0.1189 0.0931
##
## sigma^2 estimated as 4009: log likelihood=-665.16
## AIC=1338.32 AICc=1338.67 BIC=1349.44
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set 5.546197 59.35247 44.91013 0.2261564 2.549106 0.5574281
## ACF1
## Training set -0.02994887
# z tests on the fitted ARIMA coefficients
coeftest(model_111_011)
##
## z test of coefficients:
##
## Estimate Std. Error z value Pr(>|z|)
## ar1 0.290382 0.162321 1.7889 0.07363 .
## ma1 -0.737721 0.118856 -6.2069 5.405e-10 ***
## sma1 -0.697325 0.093086 -7.4912 6.827e-14 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Actuals (red) vs fitted values (blue) over the training period
plot(model_111_011$x, col = "red") ; lines(fitted(model_111_011), col = "blue")

# Test against Forecast for 27 months
model_111_011 %>%
  forecast(h = 27) %>%
  autoplot() + autolayer(test)

# Try to reduce the range of confidence interval for the forecast
# by increasing the AR by an additional seasonal order
# Therefore, the ARIMA parameters for the non-stationary train set will be
# arima (p=1, d=1, q=1) * (P=1, D=1, Q=1)
model_111_111 <- Arima(train, order = c(1, 1, 1), seasonal = c(1, 1, 1))
checkresiduals(model_111_111)

##
## Ljung-Box test
##
## data: Residuals from ARIMA(1,1,1)(1,1,1)[12]
## Q* = 11.49, df = 20, p-value = 0.9325
##
## Model df: 4. Total lags used: 24
tsdisplay(residuals(model_111_111)) # showing the PACF

# Coefficient estimates and training accuracy for ARIMA(1,1,1)(1,1,1)[12]
summary(model_111_111)
## Series: train
## ARIMA(1,1,1)(1,1,1)[12]
##
## Coefficients:
## ar1 ma1 sar1 sma1
## 0.2953 -0.7395 0.0742 -0.7455
## s.e. 0.1625 0.1191 0.1540 0.1350
##
## sigma^2 estimated as 4021: log likelihood=-665.04
## AIC=1340.08 AICc=1340.61 BIC=1353.98
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set 5.668452 59.18607 44.73699 0.2325546 2.538906 0.5552791
## ACF1
## Training set -0.03008069
# z tests on the fitted ARIMA coefficients
coeftest(model_111_111)
##
## z test of coefficients:
##
## Estimate Std. Error z value Pr(>|z|)
## ar1 0.295269 0.162540 1.8166 0.06928 .
## ma1 -0.739541 0.119125 -6.2081 5.363e-10 ***
## sar1 0.074177 0.154017 0.4816 0.63008
## sma1 -0.745521 0.134959 -5.5241 3.313e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Actuals (red) vs fitted values (blue) over the training period
plot(model_111_111$x, col = "red") ; lines(fitted(model_111_111), col = "blue")

# Test against Forecast for 27 months
model_111_111 %>%
  forecast(h = 27) %>%
  autoplot() + autolayer(test)

# Try auto.arima: automatic order selection on the training set
fitautoarima <- auto.arima(train)
checkresiduals(fitautoarima)

##
## Ljung-Box test
##
## data: Residuals from ARIMA(1,1,1)(0,1,1)[12]
## Q* = 12.319, df = 21, p-value = 0.9306
##
## Model df: 3. Total lags used: 24
tsdisplay(residuals(fitautoarima)) # showing the PACF

# auto.arima selected ARIMA(1,1,1)(0,1,1)[12] — same as the manual model above
summary(fitautoarima)
## Series: train
## ARIMA(1,1,1)(0,1,1)[12]
##
## Coefficients:
## ar1 ma1 sma1
## 0.2904 -0.7377 -0.6973
## s.e. 0.1623 0.1189 0.0931
##
## sigma^2 estimated as 4009: log likelihood=-665.16
## AIC=1338.32 AICc=1338.67 BIC=1349.44
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set 5.546197 59.35247 44.91013 0.2261564 2.549106 0.5574281
## ACF1
## Training set -0.02994887
# z tests on the fitted ARIMA coefficients
coeftest(fitautoarima)
##
## z test of coefficients:
##
## Estimate Std. Error z value Pr(>|z|)
## ar1 0.290382 0.162321 1.7889 0.07363 .
## ma1 -0.737721 0.118856 -6.2069 5.405e-10 ***
## sma1 -0.697325 0.093086 -7.4912 6.827e-14 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Actuals (red) vs fitted values (blue) over the training period
plot(fitautoarima$x, col = "red") ; lines(fitted(fitautoarima), col = "blue")

# Result of auto.arima is ARIMA(1,1,1)(0,1,1)[12]
# Same as the manually tuned model_111_011
# Test against Forecast for 27 months
fitautoarima %>%
  forecast(h = 27) %>%
  autoplot() + autolayer(test)

# Try ETS (exponential smoothing state space model, order chosen automatically)
model_ETS <- ets(train)
checkresiduals(model_ETS)

##
## Ljung-Box test
##
## data: Residuals from ETS(A,N,A)
## Q* = 20.91, df = 10, p-value = 0.02173
##
## Model df: 14. Total lags used: 24
tsdisplay(residuals(model_ETS)) # showing the PACF

# Smoothing parameters, initial states and training accuracy of the ETS fit
summary(model_ETS)
## ETS(A,N,A)
##
## Call:
## ets(y = train)
##
## Smoothing parameters:
## alpha = 0.5245
## gamma = 1e-04
##
## Initial states:
## l = 1812.8525
## s = 28.7322 -7.9188 1.5296 -123.0332 202.4818 140.0204
## 42.4109 78.5713 54.4692 45.4712 -254.9488 -207.7858
##
## sigma: 60.6351
##
## AIC AICc BIC
## 1743.417 1747.555 1786.659
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set 2.079917 57.32948 45.03911 0.0288371 2.553083 0.5590291
## ACF1
## Training set 0.06755291
# Actuals (red) vs fitted values (blue) over the training period
plot(model_ETS$x, col = "red") ; lines(fitted(model_ETS), col = "blue")

# Test against Forecast for 27 months
model_ETS %>%
  forecast(h = 27) %>%
  autoplot() + autolayer(test)

# It is noted that the forecast did not track the test result closely
# Ex Ante Forecast for 6 months ----
# Train the model based on the full set of data
modelF_111_011 <- Arima(rider_ts, order = c(1, 1, 1), seasonal = c(0, 1, 1))
modelF_ETS <- ets(rider_ts)
# Ex ante forecast for 6 months ahead
fcst_111_011 <- forecast(modelF_111_011, h = 6)
fcst_ETS <- forecast(modelF_ETS, h = 6)
# Save the ex ante forecast of each model in a new data frame, keeping the
# point forecast and the 95% interval (columns 2 and 3 are the 80% bands)
F_111_011 <- data.frame(fcst_111_011) %>% select(-2, -3)
F_111_011$Time <- dmy(paste(1, rownames(F_111_011)))
F_111_011$Model <- "ARIMA_111_011"
F_111_011 <- F_111_011[, c(4, 5, 1, 2, 3)]
#F_111_011 = gather(F_111_011, "Type", "Forecast", 3:5)
F_ETS <- data.frame(fcst_ETS) %>% select(-2, -3)
F_ETS$Time <- dmy(paste(1, rownames(F_ETS)))
F_ETS$Model <- "ETS"
F_ETS <- F_ETS[, c(4, 5, 1, 2, 3)]
#F_ETS = gather(F_ETS, "Type", "Forecast", 3:5)
fcst_df <- rbind(F_111_011, F_ETS, deparse.level = 0)
# Extract the recent actuals (2017 onward) as a data frame.
# (Renamed from `ts`/`df`, which shadowed stats::ts and stats::df.)
actual_ts <- window(rider_ts, start = 2017)
ds <- data.frame(actual_ts)
names(ds) <- "Point.Forecast"
# NOTE(review): as.Date() on a ts object depends on a registered method
# (e.g. zoo's) — base R provides none; confirm zoo is loaded.
ds$Time <- as.Date(actual_ts)
ds$Point.Forecast <- as.numeric(ds$Point.Forecast)
ds$Model <- "Actual"
ds <- ds[, c(2, 3, 1)]
ds_df <- full_join(ds, fcst_df)
## Joining, by = c("Time", "Model", "Point.Forecast")
# Plot the actuals against both model forecasts with 95% interval ribbons
ds_df %>% ggplot(aes(x = Time, y = Point.Forecast, col = Model)) +
  geom_point() +
  geom_line() +
  geom_ribbon(aes(ymin = Lo.95, ymax = Hi.95, fill = Model), linetype = 0, alpha = .2) +
  theme_bw() +
  theme(legend.position = "bottom",
        axis.text = element_text(colour = "black")
  ) +
  labs(title = "Ridership Analysis",
       subtitle = "6 months forecast of Ridership",
       y = "Count of Riders",
       x = "Date")
